
# Graphical parameters ----------------------------------------------------

my_vars = c("cog", "noncog", "consci", "extro", "educ_y", "leader_conf", "jock_mean", "nerd_mean", "johted", 
            "energy", "social", "duty", "delib", "achieve", "masc", "leader", "conf", "visuos", "words", "math",
            "cog:noncog", "cog:extro", "cog:consci", "extro:consci", 
            "cog:educ_y", "educ_y:noncog", "educ_y:extro", "educ_y:consci",
            "noncog_noise1", "noncog_noise2", "noncog_noise3", "noncog_noise4")

# ggplot aesthetic mappings
var_names = c("cog" = "Cognitive", "noncog" = "Noncognitive", "leader_conf" = "Leadership+Confidence",
              "consci" = "Conscientiousness", "extro" = "Extraversion",
              "educ_y" = "Education Years",
              "cog:noncog" = "Cog*Noncog", "cog:extro" = "Cog*Extra", "cog:consci" = "Cog*Consci", "extro:consci" = "Extra*Consci",
              "cog:educ_y" = "Cog*EducYears", "educ_y:noncog" = "Noncog*EducYears", "educ_y:extro" = "Extra*EducYears", "educ_y:consci" = "Consci*EducYears",
              "noncog_noise1" = "Noncog + 2%SD Yearly Noise", "noncog_noise2" = "Noncog + 4%SD Yearly Noise",
              "noncog_noise3" = "Noncog + 20%SD Flat Noise", "noncog_noise4" = "Noncog + 100%SD Flat Noise",
              "1_1" = "Both Low", "1_3" = "Low Extra, High Cognitive", "3_1" = "High Extra, Low Cognitive", "3_3" = "Both High")
line_types = c("cog" = "solid", "noncog" = "longdash", "consci" = "twodash", "extro" = "dotted", "leader_conf" = "longdash",
               "educ_y" = "dotdash",
               "cog:noncog" = "dotted", "cog:extro" = "longdash", "cog:consci" = "dotdash", "extro:consci" = "dashed",
               "cog:educ_y" = "dotted", "educ_y:noncog" = "longdash", "educ_y:extro" = "dotdash", "educ_y:consci" = "dashed",
               "noncog_noise1" = "longdash", "noncog_noise2" = "longdash", "noncog_noise3" = "longdash", "noncog_noise4" = "longdash",
               "1_1" = "solid", "1_3" = "longdash", "3_1" = "twodash", "3_3" = "dotted")
point_types = c("cog" = 15, "noncog" = 16, "consci" = 17, "extro" = 18, "educ_y" = 25, "leader_conf" = 8,
                "cog:noncog" = 17, "cog:extro" = 25, "cog:consci" = 13, "extro:consci" = 14,
                "cog:educ_y" = 17, "educ_y:noncog" = 25, "educ_y:extro" = 13, "educ_y:consci" = 14,
                "noncog_noise1" = 16, "noncog_noise2" = 16, "noncog_noise3" = 16, "noncog_noise4" = 16,
                "1_1" = 15, "1_3" = 16, "3_1" = 17, "3_3" = 18)
color_types = c("cog" = "steelblue", "noncog" = "darkorange", "consci" = "forestgreen", "extro" = "firebrick", 
                "educ_y" = "purple", "leader_conf" = "coral",
                "cog:noncog" = "forestgreen", "cog:extro" = "darkorange", "cog:consci" = "coral", "extro:consci" = "purple",
                "cog:educ_y" = "forestgreen", "educ_y:noncog" = "darkorange", "educ_y:extro" = "coral", "educ_y:consci" = "purple",
                "noncog_noise1" = "darkorange", "noncog_noise2" = "darkorange", "noncog_noise3" = "darkorange", "noncog_noise4" = "darkorange",
                "1_1" = "steelblue1", "1_3" = "steelblue2", "3_1" = "steelblue3", "3_3" = "steelblue4")

# Trend plot function
trendsPlot = function(my_data, my_model){
  nest_mincer = my_data %>% 
    #filter(inc_prop > 8) %>% 
    group_by(vuosi) %>% nest %>% 
    mutate(mincer1 = map(data, ~feols(as.formula(paste0(my_model," | synvuosi")), data = ., vcov = "hetero")),
    tidied1 = map(mincer1, ~tidy(., conf.int = T)))
  
  nest_mincer_unnest = nest_mincer %>% 
    select(tidied1) %>% 
    unnest(cols = c(tidied1)) %>% ungroup %>% 
    filter(term %in% my_vars)
  
  nest_mincer_unnest %>% 
    ggplot(aes(x = vuosi, y = estimate, color = as.factor(term), shape = as.factor(term), linetype = as.factor(term))) +
    geom_point(size = 3) +
    geom_line(linewidth = 0.6) +
    geom_ribbon(aes(ymin = conf.low, ymax = conf.high, fill = as.factor(term)), alpha = 0.1, color = NA)  +
    theme_bw() +
    scale_color_manual(name = "Coefficient", labels = var_names, values = color_types) +
    scale_fill_manual(name = "Coefficient", labels = var_names, values = color_types) +
    scale_shape_manual(name = "Coefficient", labels = var_names, values = point_types) +
    scale_linetype_manual(name = "Coefficient", labels = var_names, values = line_types) +
    labs(x = "Year",
         y = "Estimate") +
    scale_y_continuous(breaks = pretty) +
    scale_x_continuous(breaks = 2000:2015, labels = paste0("'", str_sub(2000:2015, 3, 4))) +
    guides(fill = FALSE)
}

# Functions ---------------------------------------------------------------


library(crayon)
library(tidylog)
crayon = function(x) cat(green(x), sep = "\n")
options("tidylog.display" = list(crayon))

# Specify lh side and rh side separately to create a formula to use in regressions
my_formula = function(x,y){paste0(x, "~", y) %>% as.formula()}

# Leave out mean
leaveOutMean = function(x){
  sapply(1:length(x), function(i)(mean(x[-i], na.rm = T)))
}

# Calculate outcome mean from an lm object
out_mean = function(x)round(mean(x$response), digits = 2)


# Calculate the mode of a vector
myMode = function(x){
 values = sort(unique(x))
 values[which.max(tabulate(match(x, values)))]
}

# Normalize
myScale = function(x)as.vector(scale(x))


# A function that converts numeric vector into percentile ranking. Normalizes mean to 50%.
percentile0100 = function(x){
  tibble(x = as.numeric(x), original_order = seq_along(x)) %>% 
    arrange(x) %>% # ascending order, puts NAs last
    mutate(perc = (seq_along(x)-1)/(sum(!is.na(x))-1), # percentile ranking
           perc = ave(perc, x), #average over equal discreet values of x
           perc = ifelse(is.na(x), NA, perc)) %>% # impute NAs
    arrange(original_order) %>% # return to original order
    pull(perc)*100 # return a vector scaled to 0-100
}

# Calculate number of missing values by column
n_missing = function(df, ..., frac = T){
  require(dplyr)
  #options(scipen = 999)
  n = length(as.list(substitute(list(...))))
  var_names = if(n == 1) {
    names(df)
  } else {
    names(select(df, ...))
  }
  missing_values = if(n == 1) {
    apply(is.na(df), 2, sum)
  } else {
    apply(is.na(select(df, ...)), 2, sum)
  }
  if(frac == T){missing_values = missing_values/nrow(df)*100}
  round(missing_values, digits = 1)
} %>% data.frame %>% mutate(variable = var_names) %>% select_("variable", "n" = ".") %>% arrange(-n)

# Unique levels by column
n_distinct2 = function(df, ...){
  require(dplyr)
  n = length(as.list(substitute(list(...))))
  if(n == 1) {
    apply(df, 2, n_distinct)
  } else {
    apply(select(df, ...), 2, n_distinct) 
  }
}

# Order ggplot labels to match ranking of curves
# order_labels = function(data, x, y, y_value = max(data$y_value), group) {
#   data %>% 
#     filter(y == y_value) %>% 
#     group_by(group) %>% 
#     summarise(avg = mean(x, na.rm = T)) %>%  
#     arrange(desc(avg)) %>% 
#     pull(group)
#   }

# myPercentile = function(x) 
# {
#   dat = parse_integer(x)
#   dat = dat[!is.na(dat)]
#   # Get the sample quantiles as fine as possible. Large vector. First an example:
#   quantile(c(1,99,100), probs = seq(0, 1, by = 0.1), type = 7, na.rm = T)
#   # When many people have the same score, the same values will be associated with multiple quantiles. 
#   pt1 <- quantile(dat, probs = seq(0, 1, by = 0.00001), type = 7, na.rm = T)
#   # Keep only unique values and their corresponding quantile. 
#   # fromLast = T keeps the largest quantile assosicated with a unique value.
#   # This means that individuals that share the same score with many others, will be given
#   # sorted to the best quantile assosiciated with that score.
#   pt2 <- unique(as.data.frame(pt1), fromLast = TRUE)
#   # Extract quantile names
#   pt3 <- rownames(pt2)
#   # Removes the percentage symbol
#   pt4 <- as.numeric(strsplit(pt3, "%"))
#   #???
#   datp <- pt4[as.numeric(cut(dat, breaks = c(-Inf, pt2$pt1), labels = 1:length(pt3)))]
#   output = rep(NA, length(x))
#   output[!is.na(x)] = datp
#   return(output)
# }


# load('data_output/ytl_1967_2013.Rdata')
# load('./data_output/ytlTutkintoSample_10k.Rdata')
# 
# 
# test = ytl %>% 
#   filter(ytl_aine == 'A',
#          ytl_vuosi_kausi == 1991.1) %>%
#   arrange(ytl_pisteet) %>% 
#   pull(ytl_pisteet)
# 
# test = ytlTutkintoSample %>% 
#   filter(aine == 'A',
#          tvuosi == 1991) %>%
#   arrange(pisteet) %>% 
#   pull(pisteet)
# 
# x = test
# 
# fun_test = myPercentile(test)
# fun_dplyr = round(cume_dist(test)*100, digits = 3)
# cume_dist
# fun_dplyr2 = rank(x, ties.method = "min", na.last = "keep")/sum(!is.na(x))
# fun_dplyr2 = round(cume_dist(fun_dplyr2)*100, digits = 3)
# 
# data.frame(my = fun_test, dplyrs = fun_dplyr, dplyrs2 = fun_dplyr2, points =test)
# 
# length(unique(test))
# length(unique(fun_test))
# unique(test)
# unique(fun_test)
# table(test, useNA = 'always')
# table(fun_test, useNA = 'always')
# 
# median(fun_test, na.rm = T)